Install necessary packages.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!requireNamespace("GEOmetadb", quietly = TRUE))
BiocManager::install("GEOmetadb")
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
if (!requireNamespace("edgeR", quietly = TRUE))
BiocManager::install("edgeR")
if (!requireNamespace("ggplot2", quietly = TRUE))
install.packages("ggplot2")
if (!requireNamespace("reshape2", quietly = TRUE))
BiocManager::install("reshape2")
if (!requireNamespace("HGNChelper", quietly = TRUE))
BiocManager::install("HGNChelper")
if (!requireNamespace("limma", quietly = TRUE))
BiocManager::install("limma")
if (!requireNamespace("knitr", quietly = TRUE))
BiocManager::install("knitr")
if (!requireNamespace("ComplexHeatmap", quietly = TRUE))
install.packages("ComplexHeatmap")
if (!requireNamespace("circlize", quietly = TRUE))
install.packages("circlize")
if (!requireNamespace("Biobase", quietly = TRUE))
BiocManager::install("Biobase")
if (!requireNamespace("MEAL", quietly = TRUE))
BiocManager::install("MEAL")
Get the data for my chosen GEO id. I chose to work with the data set from an experiment that uses RNA-sequencing to study stem cell derived cortical excitatory (cExN) and cortical inhibitory (cIN) neurons from three individuals with varying affectation of autism syndrome disorder (ASD) (Lewis, Meganathan and Baldridge 2019).
myGEOID <- 'GSE129806'
suppFiles = GEOquery::getGEOSuppFiles(myGEOID)
Read the data in. There are 2 files provided one with the RPKM data and one with the raw counts data. I will work with the second file which has the counts data.
fileNames = rownames(suppFiles)
data <- read.delim(fileNames[2], check.names=FALSE, header=TRUE)
Check what the columns represent. They represent the cExN or cIN samples from the 4 individuals. “UC” prefix represents unaffected control group individual. “AP” prefix represents affected proband individual. “IS” prefix represents the intermediated phenotype sister. “UM” prefix respesents unaffected mother.
colnames(data)
## [1] "name" "length" "UC_cIN_R1" "UC_cIN_R2" "UC_cIN_R3"
## [6] "UC_cIN_R4" "AP_cIN_R1" "AP_cIN_R2" "AP_cIN_R3" "AP_cIN_R4"
## [11] "IS_cIN_R1" "IS_cIN_R2" "IS_cIN_R3" "IS_cIN_R4" "UM_cIN_R1"
## [16] "UM_cIN_R2" "UM_cIN_R3" "UM_cIN_R4" "UC_cExN_R1" "UC_cExN_R2"
## [21] "UC_cExN_R3" "UC_cExN_R4" "AP_cExN_R1" "AP_cExN_R2" "AP_cExN_R3"
## [26] "AP_cExN_R4" "IS_cExN_R1" "IS_cExN_R2" "IS_cExN_R3" "IS_cExN_R4"
## [31] "UM_cExN_R1" "UM_cExN_R2" "UM_cExN_R3" "UM_cExN_R4"
Rename column “name” to “gene_id” for more clarity.
colnames(data)[1] <- 'gene_id'
Remove any NAs.
na.omit(data)
See how many genes we have.
dim(data)
## [1] 56609 34
Take a look at the genes in the data set and check for duplicates.
sort(table(data$gene_id), decreasing = TRUE)[1:25]
##
## 1-Mar 2-Mar 1-Dec 1-Sep 10-Mar 10-Sep 11-Mar 11-Sep
## 2 2 1 1 1 1 1 1
## 12-Sep 14-Sep 2-Sep 3-Mar 3-Sep 4-Mar 4-Sep 5_8S_rRNA
## 1 1 1 1 1 1 1 1
## 5-Mar 5-Sep 5S_rRNA 6-Mar 6-Sep 7-Mar 7-Sep 7SK
## 1 1 1 1 1 1 1 1
## 8-Mar
## 1
Looking at the results, there are no duplicates. However, looks like there are gene symbols mistakenly converted to date by Excel. I will remove these dates as it’s unclear what genes they were converted from.
data <- data[!grepl('^[0-9]{1,2}[-][a-zA-Z]{3}$', data$gene_id),]
Check the row names to see what they are. Rename each row to the gene id.
rownames(data)[1:10]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rownames(data) <- data$gene_id
Check if the symbols are all HGNC or if they are up to date using HGNCHelper.
replacements <- HGNChelper::checkGeneSymbols(data$gene_id)
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): Human gene symbols should
## be all upper-case except for the 'orf' in open reading frames. The case of some
## letters was corrected.
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): x contains non-approved
## gene symbols
# Update the gene id with the newest HGNC symbols if available
data$gene_id <- ifelse(!replacements$Approved & !is.na(replacements$Suggested.Symbol), replacements$Suggested.Symbol, replacements$x)
See how many of the remaining gene_ids could not be mapped to a HGNC symbol. Some of these include long non-coding RNAs and pseudogenes.
not_mapped <- length(which(!HGNChelper::checkGeneSymbols(data$gene_id)$Approved))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): Human gene symbols should
## be all upper-case except for the 'orf' in open reading frames. The case of some
## letters was corrected.
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): x contains non-approved
## gene symbols
In the experiment, there were at least 3 or more biological replicates for each subject. I will use n=3 to perform removal of features without at least 1 read per million in 3 of the samples. Use edgeR package to calculate CPM and remove low counts.
cpms = edgeR::cpm(data[,3:34])
rownames(cpms) <- data[,1]
keep = rowSums(cpms > 1) >= 3
data_filtered = data[keep,]
rows_removed = dim(data)[1]-dim(data_filtered)[1]
Check the number of genes now.
dim(data_filtered)
## [1] 18474 34
Create a grouping from the data set.
# Based on code from lecture 4
samples <- data.frame(lapply(colnames(data)[3:34], FUN=function(x){unlist(strsplit(x, split="_"))[c(1,2)]}))
colnames(samples) = colnames(data)[3:34]
rownames(samples) = c("patients", "cell_type")
samples <- data.frame(t(samples))
I will apply TMM to the data set. I chose to use this method because most of the genes were not differentially expressed and that the RNA seq data does not to be modified.
# Based on code from lecture 4
data_filtered_matrix <- as.matrix(data_filtered[,3:34])
rownames(data_filtered_matrix) <- data_filtered$gene_id
d = edgeR::DGEList(counts=data_filtered_matrix, group=samples$cell_type)
Calculate the normalization factor and get it.
# Based on code from lecture 4
d = edgeR::calcNormFactors(d)
normalized_counts <- edgeR::cpm(d)
Modify the regular data set for plotting.
data_to_plot <- reshape2::melt((edgeR::cpm(data_filtered[,3:34], log=TRUE)))
colnames(data_to_plot) <- c("gene_id", "sample", "cpm")
Modify the normalized data set for plotting.
normalized_data_to_plot <- reshape2::melt(log2(normalized_counts))
colnames(normalized_data_to_plot) <- c("gene_id", "sample", "cpm")
Create a pre-normalized boxplot to visualize the data.
ggplot2::ggplot(data_to_plot, ggplot2::aes(x=sample, y=cpm)) +
ggplot2::geom_boxplot() +
ggplot2::theme(axis.text.x=ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::labs(title="Original cIN and cExN RNASeq Samples", y="log2-CPM")
Create a normalized boxplot to visualize the data.
ggplot2::ggplot(normalized_data_to_plot, ggplot2::aes(x=sample, y=cpm)) +
ggplot2::geom_boxplot() +
ggplot2::theme(axis.text.x=ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::labs(title="Normalized cIN and cExN RNASeq Samples", y="log2-CPM")
## Warning: Removed 1965 rows containing non-finite values (stat_boxplot).
Create a pre-normalized density plot to visualize the data.
ggplot2::ggplot(data_to_plot, ggplot2::aes(x=cpm, color=sample)) +
ggplot2::geom_density() +
ggplot2::labs(title="Original cIN and cExN RNASeq Samples", y="density of log2-CPM", x="")
Create a normalized density plot to visualize the data.
ggplot2::ggplot(normalized_data_to_plot, ggplot2::aes(x=cpm, color=sample)) +
ggplot2::geom_density() +
ggplot2::labs(title="Normalized cIN and cExN RNASeq Samples", y="density of log2-CPM", x="")
## Warning: Removed 1965 rows containing non-finite values (stat_density).
The density graph of the data set looks like it follows a bimodal distribution.
Looking at the data set and the paper, the test condition is the cortical excitatory (cExN) and cortical inhibitory (cIN) neurons from the three first-degree relatives with absent, intermediate and severe expressivity of autism syndrome disorder (ASD). The control condition is the cExN and cIN neurons from the unrelated individual. The samples of the three first-degree relatives are prefixed with UM (Unaffected mother), IS (Intermediated Sister) and AP (Affected proband). The sample of the unrelated individual is prefixed with UC (Unrelated, unaffected control).
This data set is interesting to me because I am currently working in a lab that does research related to autism spectrum disorder.
There weren’t multiple expression values that mapped to a single gene.
Yes there were 19865 expression values that could not be mapped to a current HUGO symbol.
38108 outliers were removed.
Excluding the control cell line, there were 3 biological replicates from each of the two clonal lines for each test subject. For now, I decided to keep the replicates separate and analyze them independently.
There was approximately 30 million reads per sample according to the paper (Lewis, Meganathan and Baldridge 2019)
The purpose of this section is to conduct differential expression analysis and rank genes with the normalized expression set from assignment 1.
In assignment 1, I downloaded the data set of interest. The data set is from an experiment that uses RNA-sequencing to study stem cell derived cortical excitatory (cExN) and cortical inhibitory (cIN) neurons from three individuals with varying affectation of autism syndrome disorder (ASD) (Lewis, Meganathan and Baldridge 2019). To explore the data, the null values, mistakes (dates instead of genes) and outliers were removed. The data was then normalized using TMM normalization. The pre-normalization and post-normalization data was graphed out using boxplots and density curves.
From the MDS plot we can see two things. One is the significant dissimilarity between the two cell types. In dark green we have the cortical inhibitory neuron cells and in the blue we have the cortical excitatory neuron cells. As well we can see the that both cell types of the affected proband (AP) are clustered near the top. Both cell types of the mother (UM) and sibling (IS) of the affected proband are clustered near the middle and the unaffected control cell types are clustered near the bottom. Since we’re interested in seeing which genes may be attributed to autism syndrome disorder, we’ll focus our analysis on which patient (UM, IS, AP or UC) the cells come from.
limma::plotMDS(d, labels=rownames(samples), col = c("darkgreen", "blue")[factor(samples$cell_type)])
### Loading normalized data and producing heat map
# Take a look at the normalized data from the previous assignment
knitr::kable(normalized_counts[1:5, 1:5], type="html")
| UC_cIN_R1 | UC_cIN_R2 | UC_cIN_R3 | UC_cIN_R4 | AP_cIN_R1 | |
|---|---|---|---|---|---|
| A1BG | 0.7720147 | 0.389810 | 1.0782781 | 0.4301998 | 0.7493442 |
| A1BG-AS1 | 4.7864913 | 4.911606 | 4.8522515 | 5.1193774 | 5.3952779 |
| A2M | 40.3763699 | 57.263093 | 20.7029399 | 34.8461820 | 20.0324672 |
| A2M-AS1 | 2.0458390 | 3.352366 | 4.2052847 | 3.8287780 | 0.7493442 |
| A2ML1 | 0.1544029 | 0.194905 | 0.2695695 | 0.2581199 | 0.0999126 |
# Convert it to a number matrix
heatmap_matrix <- as.matrix(normalized_counts)
# Scale and centre each row around the mean
heatmap_matrix <- t(scale(t(heatmap_matrix)))
# Create the heat map
if(min(heatmap_matrix) == 0){
heatmap_col = circlize::colorRamp2(c(0, max(heatmap_matrix)),
c("white", "red"))
} else {
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix), 0,
max(heatmap_matrix)), c("blue", "white", "red"))
}
current_heatmap <- ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix),
show_row_dend = TRUE, show_column_dend = TRUE,
col=heatmap_col, show_column_names = TRUE, show_row_names = FALSE,
show_heatmap_legend = TRUE)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` arugment by explicitly setting
## TRUE/FALSE to it. Set `ht_opt$message = FALSE` to turn off this
## message.
current_heatmap
Now we want to plot the MDS using limma R package. We can tell there are 3 distinct clusters. The cluster to the top left show the cIN neuron cells which all have similar logFC dim1 and dim2 scores. The cluster to the top right shows the affected proband, unaffected mother and intermediate sibling cExN cells, which are distinct from the unaffected control cExN cells. This is useful because we want to see analyze the gene expression in cells from patients with autism syndrome disorder or their relatives.
limma::plotMDS(heatmap_matrix, col = rep(c("darkgreen", "blue"), 10))
To see the data in a different perspective, we can colour by patient and do another MDS plot. We can see that the clustering for patients is better in the cIN cells than the cExN cells. Once again we see that there is significant difference between the gene expression of UC cExN cells and the gene expression of the other 3 patient cells.
pat_colors <- rainbow(4)
pat_colors <- unlist(lapply(pat_colors,FUN=function(x){rep(x,4)}))
limma::plotMDS(heatmap_matrix, col = pat_colors)
For the next step, we want to create a model matrix and fit the data we have into it. We will focus on using the patient column for analysis.
# Set model to patients
model_design <- model.matrix(~ samples$patients)
knitr::kable(model_design, type="html")
| (Intercept) | samples\(patientsIS| samples\)patientsUC | samples$patientsUM | |
|---|---|---|---|
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
After, we can construct a expression matrix and minimal set from the normalized counts.
# There was an error citing duplicate row names. We will use make.names to make them unique.
rownames(normalized_counts) = make.names(rownames(normalized_counts), unique=TRUE)
# Calculate expression matrix and minimal set
expression_matrix <- as.matrix(normalized_counts)
minimal_set <- Biobase::ExpressionSet(assayData=expression_matrix)
Then fit the data to the model and apply empirical Bayes to calculate P values.
fit <- limma::lmFit(minimal_set, model_design)
fit2 <- limma::eBayes(fit,trend=TRUE)
topfit <- limma::topTable(fit2,
coef=ncol(model_design),
adjust.method = "BH",
number = nrow(expression_matrix))
output_hits <- merge(rownames(normalized_counts),
topfit,
by.y=0, by.x=1,
all.y=TRUE)
output_hits <- output_hits[order(output_hits$P.Value),]
# Take a look at the top 10 hits
knitr::kable(output_hits[1:10,], type="html")
| x | logFC | AveExpr | t | P.Value | adj.P.Val | B | |
|---|---|---|---|---|---|---|---|
| 17985 | ZNF257 | 7.113438 | 1.8143439 | 32.22859 | 0 | 0 | 19.84025 |
| 17948 | ZNF208 | 6.565174 | 1.7038129 | 31.64309 | 0 | 0 | 19.77352 |
| 18384 | ZNF835 | 10.623805 | 4.0443022 | 25.35091 | 0 | 0 | 18.79089 |
| 7320 | GAPDHP35 | 4.685038 | 1.3709558 | 22.47889 | 0 | 0 | 18.10365 |
| 4555 | CDYL2 | -30.850848 | 27.4617530 | -22.33288 | 0 | 0 | 18.06297 |
| 133 | AC003973.3 | 3.198164 | 0.8114953 | 18.97549 | 0 | 0 | 16.92398 |
| 2426 | AL365357.1 | 22.052102 | 12.6921468 | 18.51508 | 0 | 0 | 16.73133 |
| 5454 | CYP4F29P | -2.319478 | 0.6142981 | -18.09176 | 0 | 0 | 16.54480 |
| 6235 | EIF1AY | -39.206172 | 10.4449541 | -17.81348 | 0 | 0 | 16.41701 |
| 1826 | ADORA2B | 4.099782 | 3.5184098 | 17.26614 | 0 | 0 | 16.15294 |
From the results we can see that 2499 genes had a P value < 0.05.
length(which(output_hits$P.Value < 0.05))
## [1] 2499
Once corrected, that number goes down to 714 genes.
simple_adj_p_values <- length(which(output_hits$adj.P.Val < 0.05))
simple_adj_p_values
## [1] 714
Let’s graph the p-values to see if we have any potential problems. The graph shows that we have a set of good p-values. Now we can apply multiple hypothesis test correction
ggplot2::ggplot(output_hits, ggplot2::aes(x=P.Value)) +
ggplot2::geom_histogram() +
ggplot2::labs(title="P values", y="counts", x="p-value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We now want to improve our results and account for the variability seen in different patients and cell types. We’ll create a design matrix that accounts for both cell type and patient.
model_design_ct <- model.matrix(~ samples$cell_type + samples$patients)
knitr::kable(model_design_ct, type="html")
| (Intercept) | samples\(cell_typecIN| samples\)patientsIS | samples\(patientsUC| samples\)patientsUM | ||
|---|---|---|---|---|
| 1 | 1 | 0 | 1 | 0 |
| 1 | 1 | 0 | 1 | 0 |
| 1 | 1 | 0 | 1 | 0 |
| 1 | 1 | 0 | 1 | 0 |
| 1 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 1 |
| 1 | 1 | 0 | 0 | 1 |
| 1 | 1 | 0 | 0 | 1 |
| 1 | 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 1 | 0 | 0 |
| 1 | 0 | 1 | 0 | 0 |
| 1 | 0 | 1 | 0 | 0 |
| 1 | 0 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 |
Now we repeat the previous step of fitting the data to our new model.
fit_ct <- limma::lmFit(minimal_set, model_design_ct)
fit_ct2 <- limma::eBayes(fit_ct,trend=TRUE)
topfit_ct <- limma::topTable(fit_ct2,
coef=ncol(model_design_ct),
adjust.method = "BH",
number = nrow(expression_matrix))
output_hits_ct <- merge(rownames(normalized_counts),
topfit_ct,
by.y=0, by.x=1,
all.y=TRUE)
output_hits_ct <- output_hits_ct[order(output_hits_ct$P.Value),]
# Take a look at the top 10 hits
knitr::kable(output_hits_ct[1:10,], type="html")
| x | logFC | AveExpr | t | P.Value | adj.P.Val | B | |
|---|---|---|---|---|---|---|---|
| 17985 | ZNF257 | 7.113438 | 1.8143439 | 32.36326 | 0 | 0 | 28.05109 |
| 17948 | ZNF208 | 6.565174 | 1.7038129 | 31.91476 | 0 | 0 | 27.95467 |
| 18384 | ZNF835 | 10.623805 | 4.0443022 | 30.20039 | 0 | 0 | 27.55387 |
| 7320 | GAPDHP35 | 4.685038 | 1.3709558 | 24.23032 | 0 | 0 | 25.62799 |
| 4555 | CDYL2 | -30.850848 | 27.4617530 | -23.89015 | 0 | 0 | 25.48564 |
| 2426 | AL365357.1 | 22.052102 | 12.6921468 | 19.78701 | 0 | 0 | 23.36693 |
| 1895 | AGBL4 | 5.116940 | 2.3194307 | 19.31923 | 0 | 0 | 23.06879 |
| 133 | AC003973.3 | 3.198164 | 0.8114953 | 19.19195 | 0 | 0 | 22.98529 |
| 13022 | PTPRN2 | -56.016087 | 40.4856521 | -19.12102 | 0 | 0 | 22.93830 |
| 1284 | AC104692.2 | 1.721314 | 0.6920458 | 18.71185 | 0 | 0 | 22.66078 |
Again take a look at the p-values. From the results we can see that 5682 genes had a P value < 0.05.
length(which(output_hits_ct$P.Value < 0.05))
## [1] 5682
Once corrected, that number goes down to 3171 genes.
complex_adj_p_values <- length(which(output_hits_ct$adj.P.Val < 0.05))
complex_adj_p_values
## [1] 3171
Now I want to compare the two models I have.
simple_model_pvalues <- data.frame(gene_id = output_hits$x, simple_pvalue = output_hits$P.Value)
complex_model_pvalues <- data.frame(gene_id = output_hits_ct$x, complex_pvalue = output_hits_ct$P.Value)
two_models_pvalues <- merge(simple_model_pvalues, complex_model_pvalues, by.x=1, by.y=1)
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue<0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$complex_pvalue<0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue<0.05 & two_models_pvalues$complex_pvalue<0.05] <- "red"
plot(two_models_pvalues$simple_pvalue, two_models_pvalues$complex_pvalue, col = two_models_pvalues$colour, xlab = "simple model p-values", ylab ="Complex model p-values", main="Simple vs Complex Limma")
legend("topleft", inset=0.1, legend=c("Simple", "Complex", "Both", "Not Significant"),
fill = c("orange", "blue", "red", "black"))
In the above plot, we can see that the simple model which considers patients only has a better fit to the p-values of both models.
We want to include the FMR1 gene in the plot which is an important gene for autism syndrome disorder.
two_models_pvalues$colour <- "grey"
two_models_pvalues$colour[two_models_pvalues$gene_id == "FMR1"] <- "red"
plot(two_models_pvalues$simple_pvalue, two_models_pvalues$complex_pvalue, col = two_models_pvalues$colour, xlab = "simple model p-values", ylab ="Complex model p-values", main="Simple vs Complex Limma")
points(two_models_pvalues[which(two_models_pvalues$gene_id == "CHCHD2"), 2:3],pch=20, col="red", cex=1.5 )
legend("topleft", inset=0.1, legend=c("FMR1", "rest"),
fill = c("red", "grey"), cex=0.7)
To see differentially expressed genes, we use a MA plot.
volcano_plot <- cbind(output_hits$logFC, -log10(output_hits$adj.P.Val))
colnames(volcano_plot) <- c("logFC", "-log10 of adj. P value")
point.col <- ifelse(output_hits$logFC < 0, "red", "blue")
plot(volcano_plot, pch = 8, col = point.col, cex = 0.5, main = "logFC vs adjusted p value Graph ")
The volcano graph above shows the genes that are upregulated in blue and downregulated in red. It tells us the distribution of the upregulated vs downregulated.
There were 714 genes that were differentially expressed for the simple model considering patient type. There were 3171 genes that were differentially expressed for complex model considering both patien type and cell type. I used the p-value of 0.05 since that is the most common p-value widely accepted.
I chose to use the Benjamini-Hochberg method because it is straightforward and intuitive to use. As well since there are over 20,000 genes to consider, the false discovery rate can cause many false positives. The Benjamini-Hochberg test helps with reducing false positive rate, it in turns help reeduces the number of false positives in my model.